The vast majority of these notes are derived from Casella and Berger, and from Dr. Chandola's notebooks.
You should also treat these as a complement to the Seeing Theory suggested readings that we will also discuss in class.
Sample space $S$: the set of all possible outcomes in an experiment
Event: A collection of possible outcomes (a set of possible outcomes)
Sigma algebra $\mathcal{B}$: essentially, all events that can be given a probability
Axioms of probability:
Assume there is a set of nonnegative numbers $p_1, \ldots, p_n$, one for each outcome $s_i \in S$, that sum to one. If: $$P(A) = \sum_{\{i: s_i \in A\}} p_i$$
Then $P$ is a probability function (ignoring the stuff with $\mathcal{B}$ for now).
Exercise: Prove that if $P(\{H\}) = P(\{T\})$, then $P(\{H\}) = P(\{T\}) = \frac{1}{2}$
If $A$ and $B$ are events in $S$, and $P(B) \gt 0$, then the conditional probability of A given B, written as $P(A|B)$, is: $$P(A|B) = \frac{P(A \cap B)}{P(B)}$$ or equivalently, as $$P(A|B) = \frac{P(A = a, B =b)}{P(B=b)}$$ The latter notation will be useful below
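For instance (an illustrative sketch with arbitrarily chosen events, not from the text): roll a fair six-sided die, let $A$ be "the roll is even" and $B$ be "the roll is greater than 3". Then $P(A|B) = P(A \cap B)/P(B) = (2/6)/(3/6) = 2/3$, which a quick simulation confirms:
import numpy as np

# simulate many rolls of a fair six-sided die
rolls = np.random.randint(1, 7, size=100_000)
A = (rolls % 2 == 0)   # event A: the roll is even
B = (rolls > 3)        # event B: the roll is greater than 3
# empirical P(A|B) = P(A and B) / P(B); should be close to 2/3
print((A & B).mean() / B.mean())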
Exercise Free sandwich to students. (Pg. 21-22)
Two events $A$ and $B$ are statistically independent if $P(A \cap B) = P(A)P(B)$. Naturally, if $P(A \cap B) \neq P(A)P(B)$, then the events are not independent. Machine learning is largely about leveraging these conditional relationships between variables.
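A quick simulated sketch of this definition (illustrative only), using two independent fair coin flips:
import numpy as np

# two independent fair coin flips; A = first flip is heads, B = second flip is heads
flips = np.random.randint(0, 2, size=(100_000, 2))
A = flips[:, 0] == 1
B = flips[:, 1] == 1
# for independent events, P(A and B) should be close to P(A) * P(B)
print((A & B).mean(), A.mean() * B.mean())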
A random variable is a function from a sample space $S$ into the real numbers.
Exercise: What is a random variable we could compute with:
Exercise Define the random variable $X$ to be the number of heads on 3 coin tosses.
The cumulative distribution function, or cdf, of a random variable $X$ is defined as: $$F_X(x) = P_X(X\leq x),\mathrm{\, for\, all\,} x$$
Exercise: Draw the $cdf$ of $X$ when $X$ is the number of heads observed after tossing 3 fair coins.
The probability mass function (pmf) of a discrete random variable $X$ is given by: $$f_X(x) = P(X=x),\mathrm{\, for\, all\,} x$$
Exercise: Why is this only for discrete random variables?
Note, for a discrete (integer-valued) random variable: $$ P(X \leq b) = \sum_{k \leq b} f_X (k) = F_X(b)$$
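A quick check of this identity for the running example, the number of heads in 3 fair tosses (a sketch relying on scipy's binom, since that count is Binomial($3, 0.5$)):
from scipy.stats import binom

# X = number of heads in 3 fair coin tosses ~ Binomial(n=3, p=0.5)
X = binom(3, 0.5)
for b in range(4):
    # the cdf F_X(b) should equal the sum of the pmf over k <= b
    print(b, X.pmf(range(b + 1)).sum(), X.cdf(b))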
The probability density function (pdf) of a continuous random variable $X$ is the function that satisfies: $$F_X(x) = \int_{-\infty}^x f_X(t) dt$$
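To see this relationship concretely (an illustrative sketch using a normal distribution and numerical integration; the value of $x$ is arbitrary):
import numpy as np
from scipy.stats import norm
from scipy.integrate import quad

# numerically integrate the standard normal pdf from -infinity up to x
x = 1.3
area, _ = quad(norm.pdf, -np.inf, x)
# the integral of the pdf should match the cdf F_X(x)
print(area, norm.cdf(x))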
A random variable $X$ is continuous if $F_X(x)$ is a continuous function of $x$. It is discrete if $F_X(x)$ is a step function of $x$.
Exercise: What are some examples of continuous variables? Of discrete ones?
Two random variables $X$ and $Y$ are identically distributed if $F_X(x) = F_Y(x)$ for all $x$
Exercise Define two identically distributed random variables that can be computed from a single coin tossed three times.
First, a note for this section: If $X$ is a RV, then any function of $X$, say $g(X)$, is also a random variable. For generality, we're going to use $g(X)$ instead of just $X$ here.
The expected value, or expectation, of a random variable is simply the variable's average value, weighted by the probability of that value occurring.
Exercise: What is the expected value of the random variable $X$, the number of heads I get when I flip a coin 100 times?
Formally, if $X$ is continuous, then:
$$ \mathbb{E}\, g(X) = \int_{-\infty}^{\infty} g(x) f_X(x)\, dx$$
And if $X$ is discrete, then:
$$ \mathbb{E}\, g(X) = \sum_{x \in \mathcal{X}} g(x)P(X=x)$$
A useful property of expectations is linearity: if $a$ and $b$ are constants, then $\mathbb{E}(aX + b) = a\,\mathbb{E}(X) + b$.
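A small simulation sketch of linearity (the random variable and the constants here are arbitrary choices):
import numpy as np

# X = number of heads in 3 fair coin tosses, simulated many times
x = np.random.binomial(3, 0.5, size=100_000)
a, b = 2.0, 5.0
# E(aX + b) should be close to a*E(X) + b
print((a * x + b).mean(), a * x.mean() + b)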
The variance of a random variable $X$ is defined as: $$ \mathrm{Var}(X) = \mathbb{E}\left[(X - \mathbb{E}(X))^2\right] $$
The standard deviation is equal to $\sqrt{\mathrm{Var}(X)}$.
Both of these measures give you an idea of the spread of the distribution. That is, the larger the variance, the larger the ... variability ... in $X$.
Exercise What does it mean if $\mathrm{Var}(X) = 0$?
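A quick simulation sketch of the variance definition (illustrative values, reusing the 3-coin example):
import numpy as np

x = np.random.binomial(3, 0.5, size=100_000)
# variance as the average squared deviation from the mean, vs. numpy's built-in
print(((x - x.mean()) ** 2).mean(), x.var())
# the standard deviation is the square root of the variance
print(np.sqrt(x.var()), x.std())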
A distribution is a simplified model of a population. Because it is useful to have the same model be able to study different populations, statistical distributions are families, rather than a single pre-specified form. A specific instantiation of that family is determined by the value of the parameter(s) for that distribution.
from scipy.stats import bernoulli, poisson, binom, norm, mvn, hypergeom
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
import pandas as pd
import plotly.express as px
headimg = plt.imread('data/quarterheads.jpg')
tailimg = plt.imread('data/quartertails.jpg')
A random variable is generated from a probability distribution. There are different types of distributions defined for discrete random variables. These include:
The Bernoulli distribution represents a binary random variable which takes value 1 with success probability $p$ and value 0 with failure probability $q=1-p$. A Bernoulli distribution has only one parameter, $p$ (written as $\theta$ in the code below).
theta = 0.5
# let us draw a sample from a bernoulli distribution
b = bernoulli.rvs(theta,size=1)
print(b)
if b[0] == 0:
plt.imshow(tailimg)
plt.axis('off')
else:
plt.imshow(headimg)
plt.axis('off')
[0]
# you can also draw samples simultaneously
theta = 0.9
samples = bernoulli.rvs(theta,size=1000)
print(samples)
# count the number of successes (sample = 1). What happens when you change p?
print(np.count_nonzero(samples==1))
[1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 1 1 1] 889
# evaluate the Bernoulli pmf at both possible outcomes (0 and 1) for theta = 0.7
a = np.arange(2)
theta = 0.7
bernoulli.pmf(a, theta)
array([0.3, 0.7])
# plotting the probability mass function for the Bernoulli distribution
a = np.arange(2) # domain of the bernoulli variable
THETAS = [0.1,0.2,0.6,0.8] # different parameters
df = pd.DataFrame([bernoulli.pmf(a, theta) for theta in THETAS], columns=['tails','heads'])
df = df.assign(theta=THETAS)
melted_df = pd.melt(df, id_vars="theta")
melted_df
|   | theta | variable | value |
|---|-------|----------|-------|
| 0 | 0.1   | tails    | 0.9   |
| 1 | 0.2   | tails    | 0.8   |
| 2 | 0.6   | tails    | 0.4   |
| 3 | 0.8   | tails    | 0.2   |
| 4 | 0.1   | heads    | 0.1   |
| 5 | 0.2   | heads    | 0.2   |
| 6 | 0.6   | heads    | 0.6   |
| 7 | 0.8   | heads    | 0.8   |
fig = px.bar(melted_df, x="variable", y="value",
color="theta",
facet_col="theta",
title = "Bernoulli Probability",
labels = {"variable" : "Heads or Tails<br>(0 or 1)",
"value" : "Probability"},
height=400,
width=800)
fig.show()
colors = ['r','g','y','b']
plt.figure(figsize=(12,5))
for i, theta in enumerate([0.1, 0.2, 0.6, 0.7]):
ax = plt.subplot(1, 4, i+1)
plt.bar(a, bernoulli.pmf(a, theta), label=theta, color=colors[i], alpha=0.2)
ax.xaxis.set_ticks(a)
plt.legend(loc=0)
if i == 0:
plt.ylabel("PDF at $k$")
plt.suptitle("Bernoulli probability")
Another popular distribution for a discrete random variable is the binomial distribution. A binomial distribution has two parameters $n$ and $\theta$, where $0 \le \theta \le 1$. The sample generated by a binomial distribution denotes the number of successes observed in a sequence of $n$ binary trials (e.g., toss of a coin) when the probability of each success is $\theta$.
The samples that are drawn from a binomial distribution range between 0 and $n$.
The probability distribution is defined as: \begin{equation} p(k;n,\theta) = P(X = k) = \binom{n}{k}\theta^k (1 - \theta)^{n-k} \end{equation}
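A sketch checking this formula against scipy's binom.pmf (the binomial coefficient comes from scipy.special.comb; the values of $n$, $\theta$, and $k$ are arbitrary):
from scipy.special import comb
from scipy.stats import binom

n, theta, k = 20, 0.9, 17
# pmf computed directly from the formula, vs. scipy's implementation
print(comb(n, k) * theta**k * (1 - theta)**(n - k), binom.pmf(k, n, theta))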
# sampling from a binomial distribution: n = 20 trials, success probability 0.9
sample = binom.rvs(20, 0.9)  # a single draw; pass size=k to draw k samples at once
print(sample)
17
# plotting the pmf for a binomial distribution
plt.figure(figsize=(12,6))
k = np.arange(0, 22)
for p, color in zip([0.1, 0.3, 0.6, 0.8], colors):
rv = binom(20, p)
plt.plot(k, rv.pmf(k), lw=2, color=color, label=p)
plt.fill_between(k, rv.pmf(k), color=color, alpha=0.3)
plt.legend()
plt.title("Binomial distribution")
plt.tight_layout()
plt.ylabel("PDF at $k$")
plt.xlabel("$k$")
The Poisson distribution is typically used to model counts. Its domain ($\mathcal{X}$) is $\{0, 1, 2, \ldots\}$. It has one parameter, $\lambda$.
The probability mass function is given as: $P(X = k) = \frac{\lambda^ke^{-\lambda}}{k!}$.
The expected value, or mean, of $X$ is $\lambda$; its variance is also $\lambda$.
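A sketch checking the pmf formula against scipy's poisson.pmf (arbitrary values of $\lambda$ and $k$; the factorial comes from scipy.special):
import numpy as np
from scipy.special import factorial
from scipy.stats import poisson

lam, k = 10, 7
# pmf computed directly from the formula, vs. scipy's implementation
print(lam**k * np.exp(-lam) / factorial(k), poisson.pmf(k, lam))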
all_dfs = []
for lambd in [10, 25,50, 100]:
rv = poisson(lambd)
# calculate the pmf for different values of k and plot
k = np.arange(200)
all_dfs.append(pd.DataFrame(zip(k, rv.pmf(k), [lambd]*len(k)),
columns=['count','probability','lambda'])
)
df = pd.concat(all_dfs)
fig = px.bar(df, x="count",y="probability",facet_col='lambda',
facet_col_wrap=2,
height=600, width=800)
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()
You see that the Poisson is a bit weird: the variance and the mean are encoded in a single parameter! The negative binomial distribution is very similar (with almost the same generative story), except that it allows the mean and the variance of the distribution to differ.
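A sketch of this contrast (using scipy's nbinom with arbitrary parameters; in scipy's parameterization the negative binomial has mean $n(1-p)/p$ and variance $n(1-p)/p^2$):
from scipy.stats import nbinom, poisson

# Poisson: the mean and the variance are both lambda
print(poisson.stats(25, moments='mv'))
# negative binomial: the variance is strictly larger than the mean
print(nbinom.stats(10, 0.4, moments='mv'))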
# generating samples from this distribution (rv is the last Poisson from the loop above, with lambda = 100)
samples = rv.rvs(1000)
h = plt.hist(samples,bins=20,density=True)
plt.xlim([0,100])
The hypergeometric distribution models the number of successes when sampling without replacement; here, the number of aces in a 5-card hand dealt from a standard 52-card deck (which contains 4 aces):
[M, n, N] = [52, 4, 5]  # population size, number of aces in the deck, number of cards drawn
rv = hypergeom(M, n, N)
x = np.arange(0, n+1)
pmf_aces = rv.pmf(x)
[f'Prob of {i} aces: {j:.5f}' for i, j in enumerate(pmf_aces)]
['Prob of 0 aces: 0.65884', 'Prob of 1 aces: 0.29947', 'Prob of 2 aces: 0.03993', 'Prob of 3 aces: 0.00174', 'Prob of 4 aces: 0.00002']
A continuous random variable can take an infinite number of possible values. Several interesting distributions exist:
One of the most popular distributions is the Gaussian (normal) distribution. This distribution is defined for any number of variables. For the single-variable case, the distribution is defined using two parameters: $\mu$ and $\sigma$. $\mu$, the mean, can take any value, and $\sigma$, the standard deviation, must be $> 0$.
For a continuous distribution, you cannot compute the probability mass at any value of the random variable. Instead, you can compute the density using the probability density function: $$p(x) = \frac{1}{\sigma\sqrt{2\pi}}\exp[-\frac{1}{2}(\frac{x - \mu}{\sigma})^2]$$ The random variable represented using a Gaussian distribution can take any value from $-\infty$ to $\infty$.
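A sketch checking this density formula against scipy's norm.pdf (arbitrary values of $\mu$, $\sigma$, and $x$):
import numpy as np
from scipy.stats import norm

mu, sigma, x = 125, 8, 130
# density computed directly from the formula, vs. scipy's implementation
print(np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi)),
      norm(mu, sigma).pdf(x))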
# set the parameters
mu = 125
sigma = 8
# draw 1000 samples from this distribution
#samples = norm(mu, sigma).rvs(1000)
# plot an empirical distribution, i.e., a histogram
#h = plt.hist(samples, 30, density=True, alpha=.3)
# Compute the density at several instances of the random variable
x = np.linspace(90, 170, 10001)
# plot the density
plt.plot(x, norm(mu, sigma).pdf(x), linewidth=2)
Consider heights and weights of a population sample
hw = pd.read_csv('data/heightweight.csv')
fig = px.histogram(hw, x="Weight")
fig.show()
A fundamental component of multivariate statistics is an $n$-dimensional random vector, which is just a vector of random variables. It maps from a sample space $S$ into $\mathcal{R}^n$.
Let's simplify this, though, and just assume we have two random variables, $X$ and $Y$.
For a super neat visualization of conditional probability tables, check out the first section of this website.
If $X$ and $Y$ are discrete, their joint probability mass function (joint pmf) is defined as $f(x,y) = P(X=x,Y=y)$. This completely defines the probability distribution of the random vector $(X,Y)$.
A function $f(x,y)$ is called a joint probability density function (joint pdf) if $X$ and $Y$ are continuous and for every $A \subset \mathcal{R}^2$:
$$P((X,Y) \in A) = \iint_A f(x,y)\,dx\,dy$$
Expectations of (functions of) two random variables are computed in the same way as for single random variables.
In the discrete case: $$\mathbb{E}g(X,Y) = \sum_{(x,y)\in \mathcal{R}^2} g(x,y) f(x,y)$$.
Again, assume that we are working only with the random vector $(X, Y)$. The marginal distribution of $X$ in this case is given as:
$$f_X(x) = \sum_{y \in \mathcal{R}} f_{X,Y}(x,y)$$
You marginalize over all values of $Y$, and then get a (well-specified) probability distribution over $X$.
Note that the above is for the discrete case; the analogous definitions, with sums replaced by integrals, hold for continuous random variables.
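A small numerical sketch with a made-up joint pmf over two discrete variables (the table entries are arbitrary, chosen only to be nonnegative and sum to one); it also previews the conditional distribution defined next:
import numpy as np

# joint pmf f(x, y) for X in {0, 1} (rows) and Y in {0, 1, 2} (columns)
f = np.array([[0.10, 0.20, 0.10],
              [0.25, 0.15, 0.20]])
print(f.sum())       # sanity check: the probabilities sum to 1

# marginal of X: sum over all values of y (and analogously for Y)
f_X = f.sum(axis=1)
f_Y = f.sum(axis=0)
print(f_X, f_Y)

# conditional distribution of Y given X = 0: f(y|0) = f(0, y) / f_X(0)
print(f[0] / f_X[0])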
The conditional probability $P(Y|X)$ is (like we saw above): $P(Y=y | X=x) = \frac{P(X=x, Y=y)}{P(X=x)}$.
The conditional distribution of $Y$ given $X = x$ follows similarly:
$$f(y|x) = P(Y=y|X=x) = \frac{f(x,y)}{f_X(x)}$$
Two variables are independent if for all $x \in \mathcal{R}$ and $y \in \mathcal{R}$, $f(x,y) = f_X(x) f_Y(y)$. You can compute the probability of the two observations independently, and then just multiply them together!
When random variables are not independent, we want to know the strength of the relationship between them. One measure of that is the covariance, defined as:
$$\mathrm{Cov}(X,Y) = \mathbb{E}((X-\mu_X)(Y-\mu_Y))$$where $\mu_X = \mathbb{E}(X)$ and $\mu_Y = \mathbb{E}(Y)$.
The correlation between $X$ and $Y$ is defined by:
$$\rho_{XY} = \frac{\mathrm{Cov}(X,Y)}{\sigma_X \sigma_Y} $$where $\sigma_X = \sqrt{\mathrm{Var}(X)}$, and analogously for $Y$.
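An illustrative simulation (with an arbitrary linear construction of $Y$ from $X$ plus noise) showing how np.cov and np.corrcoef estimate these quantities from samples:
import numpy as np

x = np.random.normal(0, 1, size=10_000)
y = 0.9 * x + np.random.normal(0, 0.5, size=10_000)

print(np.cov(x, y))        # 2x2 sample covariance matrix
print(np.corrcoef(x, y))   # the off-diagonal entry is the estimated correlation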
Exercise: What is an example of two quantities that you would expect to see $\rho_{XY} = 0$? What about $\rho_{XY} = .9$? What about $\rho_{XY}= -.9$.
A reminder for later... correlation is not causation, yes, but also, be careful even with correlation!
A multinoulli distribution is a generalization of the Bernoulli distribution to trials that can take one of $k > 2$ possible values. The parameter of the multinoulli distribution is a vector ${\bf \theta}$ with $k$ entries. Each entry $\theta_i$ gives the probability of observing category $i$ in a single trial.
[1/6.]*6  # the parameter vector theta for a fair six-sided die: equal probability for each face
# generate a sample from a multinoulli distribution. This essentially simulates a single roll of a die. Note that the output is a one-hot vector of length $k = 6$
np.random.multinomial(1, [1/6.]*6, size=1)
A multinomial distribution is a generalization of the binomial distribution to trials that can take more than two possible values. The parameters of the multinomial distribution are a vector ${\bf \theta}$ and the number of trials $n$.
# generate samples from a multinomial distribution. Note that the output is a vector of length $k = 6$
np.random.multinomial(20, [1/6.]*6, size=1)
A distribution can be defined for multivariate random variables. One example is the multivariate Gaussian. In general, the random variable is a $D$-length vector ${\bf X}$. The two parameters of this distribution are a mean vector ${\bf \mu}$ and a covariance matrix $\Sigma$. The pdf at any value of ${\bf x}$ is given by: $$ \mathcal{N}({\bf x}| {\bf \mu,\Sigma}) \triangleq \frac{1}{(2\pi)^{D/2}|{\bf \Sigma}|^{1/2}}\exp\left[-\frac{1}{2}{\bf (x-\mu)^\top\Sigma^{-1}(x-\mu)}\right] $$ Note that if $D = 1$, it reduces to a univariate Gaussian distribution.
# define the parameters for D = 2
mu = np.array([10, 10])
Sigma = np.array([[4, 1.], [1., 1.]])
# draw a single sample (a 2-D vector) from this distribution
rv = np.random.multivariate_normal(mu, Sigma)
# sample some points
s = np.random.multivariate_normal(mu, Sigma, 1000)
fig = plt.figure()
plt.subplot(111)
plt.scatter(s[:, 0], s[:, 1])
# add a contour plot of the density over the range of the samples
smin = np.min(s, axis=0)
smax = np.max(s, axis=0)
t1 = np.linspace(smin[0], smax[0], 1000)
t2 = np.linspace(smin[1], smax[1], 1000)
T1, T2 = np.meshgrid(t1, t2)
# evaluate the pdf at each of these mesh points and draw the contours
from scipy.stats import multivariate_normal
Z = multivariate_normal(mu, Sigma).pdf(np.dstack((T1, T2)))
plt.contour(T1, T2, Z)
# the sample covariance matrix should be close to Sigma
np.cov(s.transpose())